suppressPackageStartupMessages(library(tidyverse))
devtools::load_all('~/Google Drive/My Drive/Scripts/R_packages/myUtilities/')
## ℹ Loading myUtilities

Settings

data_dir <- '/Volumes/Mitsu_NGS_3/METTL2A/'

wd <- "~/Google Drive/My Drive/Analysis/METTL2A/"
setwd(wd)

figdir <- paste0(wd, 'Figures/DRS_m3C_sites/Metagene_CDS/')
tabledir <- paste0(wd, 'Tables/DRS_m3C_sites/Metagene_CDS/')

theme_set(
  theme_classic(base_size = 7) +
    theme(legend.position = 'bottom')
)

Functions

paste_wd <- function(path) {
  
  paste0(wd, path)
  
}

read_CDS_annotation_bed <- function() {
  
  read_bed12(
    '/Volumes/Mitsu_NGS_2/METTL2A/Database/gencode.v43.annotation_plus-tRNA_CDS_transcriptome.bed'
  ) |> 
    select(chrom, start, end, thickStart, thickEnd) |> 
    dplyr::rename(transcript_id = chrom)
  
  
}


remove_noCDSinfo_RNAs <- function(df) {
  
  df |> 
    filter(!is.na(thickStart) & !is.na(thickEnd))
  
}

determine_kmer_region <- function(df) {
  
  df |> 
    mutate(
      kmer_region = case_when(
        is.na(thickStart) | is.na(thickEnd) ~ NA,
        kmer_middle <= thickStart ~ 'fiveprimeUTR',
        kmer_middle <= thickEnd   ~ 'CDS',
        kmer_middle <= length     ~ 'threeprimeUTR',
        .default = NA
      )
    )
  
}

calc_percent_m3C_in_region <- function(df) {
  
  df |> 
    group_by(kmer_region, genetype2) |> 
    reframe(num_m3C = n()) |> 
    group_by(genetype2) |> 
    mutate(percent_m3C = 100 * num_m3C / sum(num_m3C))
  
}

calc_region_length <- function(df) {
  
  df |> 
    mutate(
      fiveprimeUTR_length = thickStart,
      CDS_length         = thickEnd - thickStart,
      threeprimeUTR_length = length - thickEnd
    ) |> 
    select(transcript_id, genetype2, ends_with('length')) |> 
    distinct() |> 
    pivot_longer(
      cols = -c(transcript_id, genetype2, length), 
      names_pattern = '(.*)_length', 
      names_to = 'kmer_region', values_to = 'region_length'
    )
}

calc_percent_region_length <- function(df) {
  
  df |> 
    group_by(genetype2, kmer_region) |> 
    reframe(
      sum_length = sum(region_length),
      mean_length = mean(region_length, na.rm = TRUE),
      median_length = median(region_length, na.rm = TRUE),
      max_length = max(region_length, na.rm = TRUE)
    ) |> 
    group_by(genetype2) |> 
    mutate(percent_length = 100 * sum_length / sum(sum_length))
  
}

calc_relposition_within_region <- function(df) {
  
  df |> 
    mutate(
      rel_position_within_region = case_when(
        kmer_region == 'fiveprimeUTR' ~ kmer_middle / thickStart,
        kmer_region == 'CDS' ~ (kmer_middle - thickStart) / (thickEnd - thickStart),
        kmer_region == 'threeprimeUTR' ~ (kmer_middle - thickEnd) / length,
        .default = NA
      )
    ) |> 
    mutate(
      rel_position_metagene = case_when(
        kmer_region == 'fiveprimeUTR' ~ 
          mRNA_length_percentage$fiveprimeUTR * rel_position_within_region,
        kmer_region == 'CDS' ~ 
          mRNA_length_percentage$fiveprimeUTR + 
          mRNA_length_percentage$CDS * rel_position_within_region,
        kmer_region == 'threeprimeUTR' ~ 
          mRNA_length_percentage$fiveprimeUTR + 
          mRNA_length_percentage$CDS +
          mRNA_length_percentage$threeprimeUTR * rel_position_within_region,
        .default = NA
      )
    )
  
}

calc_base_position <- function(df) {
  
  df |> 
    mutate(transcript_seq = str_split(transcript_seq, '')) |> 
    unnest(transcript_seq) |>
    group_by(transcript_id) |> 
    mutate(position = row_number() - min(row_number()) + 1) |> 
    ungroup() |> 
    dplyr::rename(base = transcript_seq)
  
}


calc_CC_position <- function(df) {
  
  df |> 
    mutate(position = str_locate_all(transcript_seq, 'CC')) |> 
    unnest(position) |> 
    mutate(position = (position[,1] + position[,2]) / 2) |> 
    select(transcript_id, position)
  
}

plot_metagene_distribution_different_adjustment <- function(adjust_value) {
  
  # metagene plot
  m3C_relposition_mRNA_metageneplot <- 
    allC_m3C_mRNA_relposition_bound |> 
    ggplot(aes(x = rel_position_metagene, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(
      xintercept = c(
        mRNA_length_percentage$fiveprimeUTR,
        mRNA_length_percentage$fiveprimeUTR + mRNA_length_percentage$CDS
      ), colour = 'gray20'
    ) +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_x_continuous(limits = c(0, 100)) +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
  
  m3C_relposition_mRNA_metageneplot |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_metageneplot_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
  # Near start codon
  
  m3C_relposition_mRNA_nearstartcodon <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region != 'threeprimeUTR') |> 
    ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    #scale_x_continuous(limits = c(-500, 1000)) +
    geom_rect(aes(xmin = 0, xmax = 2, ymin = 0, ymax = .002), alpha = 1/2) +
    #geom_vline(xintercept = 0, colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))

  m3C_relposition_mRNA_nearstartcodon |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_nearstartcodon_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
  # Near stop codon
  
  m3C_relposition_mRNA_nearstopcodon <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region != 'fiveprimeUTR') |> 
    ggplot(aes(x = kmer_middle - thickEnd, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    #scale_x_continuous(limits = c(-2000, 2000)) +
    geom_vline(xintercept = 0, colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))

  m3C_relposition_mRNA_nearstopcodon |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_nearstopcodon_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
  # 5UTR near start codon
  fiveutr_startcodon <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region == 'fiveprimeUTR') |> 
    ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
    geom_density(adjust = adjust_value) +
    geom_vline(xintercept = c(-25, -10, 0), colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_x_continuous(limits = c(-200, 0)) +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
  
  fiveutr_startcodon |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_5UTR_nearstartcodon_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
  # CDS near start codon
  fiveutr_startcodon <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region == 'CDS') |> 
    ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(0, 80, 350), colour = 'gray') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))

  fiveutr_startcodon |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_CDS_nearstartcodon_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
  # CDS near stop codon
  fiveutr_startcodon <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region == 'CDS') |> 
    ggplot(aes(x = kmer_middle - thickEnd, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(0), colour = 'gray') +
    geom_hline(yintercept = 0, colour = 'gray') +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))

  fiveutr_startcodon |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_CDS_nearstopcodon_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
  # 3UTR near stop codon
  fiveutr_startcodon <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region == 'threeprimeUTR') |> 
    ggplot(aes(x = kmer_middle - thickEnd, colour = type)) +
    geom_density(adjust = adjust_value, lwd = 1.1) +
    geom_vline(xintercept = c(0), colour = 'gray') +
    geom_hline(yintercept = 0, colour = 'gray') +
    scale_x_continuous(limits = c(0, 2000)) +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
  
  fiveutr_startcodon |> 
    ggsave_multiple_formats(
      basename = paste0('m3C_relposition_mRNA_3UTR_nearstopcodon_adjust_', adjust_value),
      outdir = figdir, width = 4, height = 4, fontsize = 7
    )
  
}

Read data

Methylated positions

DRS_methylated_positions <- 
  read_tsv(
    'Tables/DRS_m3C_sites/DRS_methylated_positions_relative_range_2024-04-22.tsv' |> 
      paste_wd()
  )
## Rows: 489 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (6): transcript_id, gene_name, seqname, gene_type, ref_kmer, genetype2
## dbl (7): kmer_start, kmer_end, kmer_middle, length, rel_kmer_start, rel_kmer...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
DRS_methylated_positions
## # A tibble: 489 × 13
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCG            58       62
##  5 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCT            76       80
##  6 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ATCAA            94       98
##  7 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GCCAC           149      153
##  8 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCC           154      158
##  9 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCC           155      159
## 10 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCA           156      160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>

CDS annotation

gencode_annotation_CDS <- 
  read_CDS_annotation_bed()
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
gencode_annotation_CDS
## # A tibble: 111,324 × 5
##    transcript_id      start   end thickStart thickEnd
##    <chr>              <dbl> <dbl>      <dbl>    <dbl>
##  1 ENST00000003912.7      0  5481        715     1687
##  2 ENST00000008440.9      0  1667        128      749
##  3 ENST00000009105.5      0  2612        245     1673
##  4 ENST00000010299.10     0  1050         24     1047
##  5 ENST00000011700.10     0 10969          0     9629
##  6 ENST00000054650.9      0  1361        156      873
##  7 ENST00000054666.11     0  2178         88      388
##  8 ENST00000078527.9      0  2066        160     1639
##  9 ENST00000164247.5      0  4273        564     1665
## 10 ENST00000166244.8      0  5019        147     3162
## # ℹ 111,314 more rows

Transcript sequences

espresso_transcript_seqs <- 
  read_tsv(
    'Tables/Database/espresso_AsPC1_transcriptome_seqs_2024-04-22.tsv.gz' |> 
      paste_wd()
  ) |> 
  select(-transcript_length)
## Rows: 36717 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): transcript_id, transcript_seq
## dbl (1): transcript_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
espresso_transcript_seqs
## # A tibble: 36,717 × 2
##    transcript_id      transcript_seq                                            
##    <chr>              <chr>                                                     
##  1 ENST00000339437.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
##  2 ENST00000251607.11 AGCCCGGAAGTGCGCGTGGCGGCGGTGGCGGCTGCGGCAACAGCGGGGCCGATGTGT…
##  3 ENST00000420393.5  CAGCGGGGCCGGTAAGCGGGCGCGCGCCGCTCAGAGGGGCAGAGTTGGTGGCGTGAG…
##  4 ENST00000698415.1  GATGTATGATGAGTTTAGTTGAATGCTCGTGTTGCTGTCTGCTAGCCAAAGACCAAC…
##  5 ENST00000698416.1  CATGACTAGTTTTGTGGGTAGCAATGATGTTTAAATGTCACACACTAACCTTTTTAA…
##  6 ENST00000488263.5  AGGAACTTCATCATGAAGTCTCAAGTAAACGAACATTTTATCTTTCTTGGGATTCTA…
##  7 ENST00000424814.5  GAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCTGCCGCTCCTGCCTGCAG…
##  8 ENST00000231948.9  AGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGCACAACATGGGCAACCACCT…
##  9 ENST00000432408.6  GCCTCCTTTGCGGGTAAACAGACATGGCCGGCGAAGGAGATCAGCAGGACGCTGCGC…
## 10 ENST00000459840.5  ATGGAGGCATTTAAACTGGGACTGAGATGGGACTGAGTGATTAAATTGCTACCAGTG…
## # ℹ 36,707 more rows

List of methylated RNAs

DRS_methylated_RNAs <- 
  DRS_methylated_positions |> 
  select(transcript_id) |> 
  distinct()
DRS_methylated_RNAs
## # A tibble: 71 × 1
##    transcript_id    
##    <chr>            
##  1 ENST00000429711.7
##  2 ENST00000647248.2
##  3 ENST00000389680.2
##  4 ENST00000361390.2
##  5 ENST00000361453.3
##  6 ENST00000387347.2
##  7 ENST00000361624.2
##  8 ENST00000361739.1
##  9 ENST00000361899.2
## 10 ENST00000361227.2
## # ℹ 61 more rows

Annotation of the methylated RNAs

DRS_methylated_RNAs_annotation <- 
  DRS_methylated_positions |> 
  select(starts_with('gene'), starts_with('transcript')) |> 
  distinct()
DRS_methylated_RNAs_annotation
## # A tibble: 71 × 4
##    gene_name gene_type      genetype2 transcript_id    
##    <chr>     <chr>          <chr>     <chr>            
##  1 RPL32     protein_coding mRNA      ENST00000429711.7
##  2 RPL35A    protein_coding mRNA      ENST00000647248.2
##  3 MT-RNR1   Mt_rRNA        Mt_rRNA   ENST00000389680.2
##  4 MT-ND1    protein_coding mt-mRNA   ENST00000361390.2
##  5 MT-ND2    protein_coding mt-mRNA   ENST00000361453.3
##  6 MT-RNR2   Mt_rRNA        Mt_rRNA   ENST00000387347.2
##  7 MT-CO1    protein_coding mt-mRNA   ENST00000361624.2
##  8 MT-CO2    protein_coding mt-mRNA   ENST00000361739.1
##  9 MT-ATP6   protein_coding mt-mRNA   ENST00000361899.2
## 10 MT-ND3    protein_coding mt-mRNA   ENST00000361227.2
## # ℹ 61 more rows

Prepare dataframe of base positions in the methylated RNAs

methylated_RNAs_base_positions <- 
  espresso_transcript_seqs |> 
  right_join(DRS_methylated_RNAs) |> 
  calc_base_position()
## Joining with `by = join_by(transcript_id)`
methylated_RNAs_base_positions
## # A tibble: 101,437 × 3
##    transcript_id     base  position
##    <chr>             <chr>    <dbl>
##  1 ENST00000429711.7 A            1
##  2 ENST00000429711.7 G            2
##  3 ENST00000429711.7 C            3
##  4 ENST00000429711.7 C            4
##  5 ENST00000429711.7 C            5
##  6 ENST00000429711.7 T            6
##  7 ENST00000429711.7 T            7
##  8 ENST00000429711.7 G            8
##  9 ENST00000429711.7 C            9
## 10 ENST00000429711.7 G           10
## # ℹ 101,427 more rows

Extract position of C bases

methylated_RNAs_C_positions <- 
  methylated_RNAs_base_positions |>
  filter(base == 'C')
methylated_RNAs_C_positions
## # A tibble: 24,117 × 3
##    transcript_id     base  position
##    <chr>             <chr>    <dbl>
##  1 ENST00000429711.7 C            3
##  2 ENST00000429711.7 C            4
##  3 ENST00000429711.7 C            5
##  4 ENST00000429711.7 C            9
##  5 ENST00000429711.7 C           11
##  6 ENST00000429711.7 C           13
##  7 ENST00000429711.7 C           14
##  8 ENST00000429711.7 C           16
##  9 ENST00000429711.7 C           17
## 10 ENST00000429711.7 C           20
## # ℹ 24,107 more rows

Join data

Methylated positions

DRS_methylated_positions_CDSpos <- 
  DRS_methylated_positions |> 
  left_join(gencode_annotation_CDS) 
## Joining with `by = join_by(transcript_id)`
DRS_methylated_positions_CDSpos |>
  remove_noCDSinfo_RNAs() |> 
  determine_kmer_region() |> 
  export_tsv(outdir = tabledir, basename = 'DRS_methylated_positions_CDSpos')
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/DRS_methylated_positions_CDSpos_2024-07-29.tsv
## # A tibble: 436 × 18
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCA            33       37
##  5 ENST00000361390.2 MT-ND1    chrM    protein_cod… CCCCT           123      127
##  6 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCT           141      145
##  7 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCG           186      190
##  8 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCCT           205      209
##  9 ENST00000361390.2 MT-ND1    chrM    protein_cod… CCCCC           260      264
## 10 ENST00000361390.2 MT-ND1    chrM    protein_cod… ACCTC           322      326
## # ℹ 426 more rows
## # ℹ 11 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>, kmer_region <chr>

Positions of all C

allC_methylatedRNAs_regioninfo <- 
  methylated_RNAs_C_positions |> 
  left_join(gencode_annotation_CDS) |>
  dplyr::rename(kmer_middle = position) |> 
  mutate(length = end) |> 
  remove_noCDSinfo_RNAs() |> 
  determine_kmer_region() |> 
  left_join(DRS_methylated_RNAs_annotation)
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
allC_methylatedRNAs_regioninfo |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/allC_methylatedRNAs_regioninfo_2024-07-29.tsv
## # A tibble: 22,334 × 12
##    transcript_id     base  kmer_middle start   end thickStart thickEnd length
##    <chr>             <chr>       <dbl> <dbl> <dbl>      <dbl>    <dbl>  <dbl>
##  1 ENST00000429711.7 C               3     0  2094         77      482   2094
##  2 ENST00000429711.7 C               4     0  2094         77      482   2094
##  3 ENST00000429711.7 C               5     0  2094         77      482   2094
##  4 ENST00000429711.7 C               9     0  2094         77      482   2094
##  5 ENST00000429711.7 C              11     0  2094         77      482   2094
##  6 ENST00000429711.7 C              13     0  2094         77      482   2094
##  7 ENST00000429711.7 C              14     0  2094         77      482   2094
##  8 ENST00000429711.7 C              16     0  2094         77      482   2094
##  9 ENST00000429711.7 C              17     0  2094         77      482   2094
## 10 ENST00000429711.7 C              20     0  2094         77      482   2094
## # ℹ 22,324 more rows
## # ℹ 4 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## #   genetype2 <chr>

allC_percentage_groupedby_region <- 
  allC_methylatedRNAs_regioninfo |> 
  group_by(genetype2, kmer_region) |> 
  reframe(n = n())  |> 
  group_by(genetype2) |> 
  mutate(percent_C = 100 * n / sum(n))
allC_percentage_groupedby_region
## # A tibble: 4 × 4
## # Groups:   genetype2 [2]
##   genetype2 kmer_region       n percent_C
##   <chr>     <chr>         <int>     <dbl>
## 1 mRNA      CDS            7490     38.4 
## 2 mRNA      fiveprimeUTR   1845      9.45
## 3 mRNA      threeprimeUTR 10184     52.2 
## 4 mt-mRNA   CDS            2815    100

m3C percentage grouped by region

m3C_percentage_region <- 
  DRS_methylated_positions_CDSpos |> 
  remove_noCDSinfo_RNAs() |> 
  determine_kmer_region() |> 
  calc_percent_m3C_in_region() |> 
  arrange(genetype2)
m3C_percentage_region
## # A tibble: 4 × 4
## # Groups:   genetype2 [2]
##   kmer_region   genetype2 num_m3C percent_m3C
##   <chr>         <chr>       <int>       <dbl>
## 1 CDS           mRNA          179        70.5
## 2 fiveprimeUTR  mRNA           30        11.8
## 3 threeprimeUTR mRNA           45        17.7
## 4 CDS           mt-mRNA       182       100

Percentage of region length

length_percentage_region <- 
  DRS_methylated_positions_CDSpos |> 
  remove_noCDSinfo_RNAs() |> 
  calc_region_length() |> 
  calc_percent_region_length()
length_percentage_region
## # A tibble: 6 × 7
## # Groups:   genetype2 [2]
##   genetype2 kmer_region   sum_length mean_length median_length max_length
##   <chr>     <chr>              <dbl>       <dbl>         <dbl>      <dbl>
## 1 mRNA      CDS                29715     531.             406.       1659
## 2 mRNA      fiveprimeUTR        5341      95.4             70         433
## 3 mRNA      threeprimeUTR      50435     901.             248.       7245
## 4 mt-mRNA   CDS                 8548     950.             956        1542
## 5 mt-mRNA   fiveprimeUTR           0       0                0           0
## 6 mt-mRNA   threeprimeUTR          6       0.667            0           3
## # ℹ 1 more variable: percent_length <dbl>

Join m3C percentage and length

length_m3Csites_percentage_groupedby_region <- 
  full_join(m3C_percentage_region, length_percentage_region) |> 
  full_join(allC_percentage_groupedby_region) |> 
  select(kmer_region, genetype2, percent_m3C, percent_C, percent_length) |> 
  pivot_longer(
    cols = starts_with('percent_'), 
    names_prefix = 'percent_', values_to = 'percent')
## Joining with `by = join_by(kmer_region, genetype2)`
## Joining with `by = join_by(kmer_region, genetype2)`
length_m3Csites_percentage_groupedby_region 
## # A tibble: 18 × 4
## # Groups:   genetype2 [2]
##    kmer_region   genetype2 name    percent
##    <chr>         <chr>     <chr>     <dbl>
##  1 CDS           mRNA      m3C     70.5   
##  2 CDS           mRNA      C       38.4   
##  3 CDS           mRNA      length  34.8   
##  4 fiveprimeUTR  mRNA      m3C     11.8   
##  5 fiveprimeUTR  mRNA      C        9.45  
##  6 fiveprimeUTR  mRNA      length   6.25  
##  7 threeprimeUTR mRNA      m3C     17.7   
##  8 threeprimeUTR mRNA      C       52.2   
##  9 threeprimeUTR mRNA      length  59.0   
## 10 CDS           mt-mRNA   m3C    100     
## 11 CDS           mt-mRNA   C      100     
## 12 CDS           mt-mRNA   length  99.9   
## 13 fiveprimeUTR  mt-mRNA   m3C     NA     
## 14 fiveprimeUTR  mt-mRNA   C       NA     
## 15 fiveprimeUTR  mt-mRNA   length   0     
## 16 threeprimeUTR mt-mRNA   m3C     NA     
## 17 threeprimeUTR mt-mRNA   C       NA     
## 18 threeprimeUTR mt-mRNA   length   0.0701

Prepare percentage of mRNA length

mRNA_length_percentage <- 
  length_percentage_region |> 
  filter(genetype2 == 'mRNA') |> 
  pivot_wider(id_cols = c(genetype2), names_from = kmer_region, values_from = percent_length)
mRNA_length_percentage
## # A tibble: 1 × 4
## # Groups:   genetype2 [1]
##   genetype2   CDS fiveprimeUTR threeprimeUTR
##   <chr>     <dbl>        <dbl>         <dbl>
## 1 mRNA       34.8         6.25          59.0

Relative position of m3C and C

Determine region (UTR or CDS) of the m3C sites

DRS_methylated_positions_CDSpos_regioninfo <- 
  DRS_methylated_positions_CDSpos |> 
  filter(genetype2 == 'mRNA') |> 
  determine_kmer_region()
DRS_methylated_positions_CDSpos_regioninfo
## # A tibble: 257 × 18
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000215754.8 MIF       chr22   protein_cod… GTCCT            79       83
##  5 ENST00000215754.8 MIF       chr22   protein_cod… GCCAC           180      184
##  6 ENST00000215754.8 MIF       chr22   protein_cod… GCCCC           191      195
##  7 ENST00000215754.8 MIF       chr22   protein_cod… ACCCG           484      488
##  8 ENST00000199764.7 CEACAM6   chr19   protein_cod… TTCAG          1698     1702
##  9 ENST00000270625.7 RPS11     chr19   protein_cod… ACCCA           161      165
## 10 ENST00000270625.7 RPS11     chr19   protein_cod… CACCA           473      477
## # ℹ 247 more rows
## # ℹ 11 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>, kmer_region <chr>

Relative positions of m3C sites

m3C_relposition_mRNA_metagene <- 
  DRS_methylated_positions_CDSpos_regioninfo |> 
  calc_relposition_within_region()
m3C_relposition_mRNA_metagene
## # A tibble: 257 × 20
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000215754.8 MIF       chr22   protein_cod… GTCCT            79       83
##  5 ENST00000215754.8 MIF       chr22   protein_cod… GCCAC           180      184
##  6 ENST00000215754.8 MIF       chr22   protein_cod… GCCCC           191      195
##  7 ENST00000215754.8 MIF       chr22   protein_cod… ACCCG           484      488
##  8 ENST00000199764.7 CEACAM6   chr19   protein_cod… TTCAG          1698     1702
##  9 ENST00000270625.7 RPS11     chr19   protein_cod… ACCCA           161      165
## 10 ENST00000270625.7 RPS11     chr19   protein_cod… CACCA           473      477
## # ℹ 247 more rows
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## #   kmer_region <chr>, rel_position_within_region <dbl>,
## #   rel_position_metagene <dbl>

Relative position of all C bases

allC_methylatedRNAs_regioninfo_relposition <- 
  allC_methylatedRNAs_regioninfo |> 
  calc_relposition_within_region()
allC_methylatedRNAs_regioninfo_relposition
## # A tibble: 22,334 × 14
##    transcript_id     base  kmer_middle start   end thickStart thickEnd length
##    <chr>             <chr>       <dbl> <dbl> <dbl>      <dbl>    <dbl>  <dbl>
##  1 ENST00000429711.7 C               3     0  2094         77      482   2094
##  2 ENST00000429711.7 C               4     0  2094         77      482   2094
##  3 ENST00000429711.7 C               5     0  2094         77      482   2094
##  4 ENST00000429711.7 C               9     0  2094         77      482   2094
##  5 ENST00000429711.7 C              11     0  2094         77      482   2094
##  6 ENST00000429711.7 C              13     0  2094         77      482   2094
##  7 ENST00000429711.7 C              14     0  2094         77      482   2094
##  8 ENST00000429711.7 C              16     0  2094         77      482   2094
##  9 ENST00000429711.7 C              17     0  2094         77      482   2094
## 10 ENST00000429711.7 C              20     0  2094         77      482   2094
## # ℹ 22,324 more rows
## # ℹ 6 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## #   genetype2 <chr>, rel_position_within_region <dbl>,
## #   rel_position_metagene <dbl>

CC positions

methylated_RNAs_CC_positions <- 
  espresso_transcript_seqs |> 
  right_join(DRS_methylated_RNAs) |> 
  calc_CC_position() 
## Joining with `by = join_by(transcript_id)`
methylated_RNAs_CC_relpositions <- 
  methylated_RNAs_CC_positions |> 
  left_join(gencode_annotation_CDS) |>
  dplyr::rename(kmer_middle = position) |> 
  mutate(length = end) |> 
  remove_noCDSinfo_RNAs() |> 
  determine_kmer_region() |> 
  left_join(DRS_methylated_RNAs_annotation) |> 
  calc_relposition_within_region() |>
  mutate(type = 'all CC')
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
methylated_RNAs_CC_relpositions
## # A tibble: 5,081 × 14
##    transcript_id  kmer_middle start   end thickStart thickEnd length kmer_region
##    <chr>                <dbl> <dbl> <dbl>      <dbl>    <dbl>  <dbl> <chr>      
##  1 ENST000004297…         3.5     0  2094         77      482   2094 fiveprimeU…
##  2 ENST000004297…        13.5     0  2094         77      482   2094 fiveprimeU…
##  3 ENST000004297…        16.5     0  2094         77      482   2094 fiveprimeU…
##  4 ENST000004297…        20.5     0  2094         77      482   2094 fiveprimeU…
##  5 ENST000004297…        32.5     0  2094         77      482   2094 fiveprimeU…
##  6 ENST000004297…        43.5     0  2094         77      482   2094 fiveprimeU…
##  7 ENST000004297…        59.5     0  2094         77      482   2094 fiveprimeU…
##  8 ENST000004297…        65.5     0  2094         77      482   2094 fiveprimeU…
##  9 ENST000004297…        82.5     0  2094         77      482   2094 CDS        
## 10 ENST000004297…        85.5     0  2094         77      482   2094 CDS        
## # ℹ 5,071 more rows
## # ℹ 6 more variables: gene_name <chr>, gene_type <chr>, genetype2 <chr>,
## #   rel_position_within_region <dbl>, rel_position_metagene <dbl>, type <chr>

Join relative position of all C and m3C sites

allC_m3C_mRNA_relposition_bound <- 
  allC_methylatedRNAs_regioninfo_relposition |> 
  filter(genetype2 == 'mRNA') |> 
  mutate(type = 'allC') |> 
  bind_rows(m3C_relposition_mRNA_metagene |> mutate(type = 'm3C')) |> 
  bind_rows(methylated_RNAs_CC_relpositions) |> 
  filter(genetype2 == 'mRNA') 
allC_m3C_mRNA_relposition_bound |> 
  export_tsv(outdir = tabledir, compression = 'gz')
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/allC_m3C_mRNA_relposition_bound_2024-07-29.tsv.gz
## # A tibble: 24,144 × 22
##    transcript_id     base  kmer_middle start   end thickStart thickEnd length
##    <chr>             <chr>       <dbl> <dbl> <dbl>      <dbl>    <dbl>  <dbl>
##  1 ENST00000429711.7 C               3     0  2094         77      482   2094
##  2 ENST00000429711.7 C               4     0  2094         77      482   2094
##  3 ENST00000429711.7 C               5     0  2094         77      482   2094
##  4 ENST00000429711.7 C               9     0  2094         77      482   2094
##  5 ENST00000429711.7 C              11     0  2094         77      482   2094
##  6 ENST00000429711.7 C              13     0  2094         77      482   2094
##  7 ENST00000429711.7 C              14     0  2094         77      482   2094
##  8 ENST00000429711.7 C              16     0  2094         77      482   2094
##  9 ENST00000429711.7 C              17     0  2094         77      482   2094
## 10 ENST00000429711.7 C              20     0  2094         77      482   2094
## # ℹ 24,134 more rows
## # ℹ 14 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## #   genetype2 <chr>, rel_position_within_region <dbl>,
## #   rel_position_metagene <dbl>, type <chr>, seqname <chr>, ref_kmer <chr>,
## #   kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## #   rel_kmer_middle <dbl>, rel_kmer_end <dbl>

Plot

Plot percentage

length_m3Csites_percentage_groupedby_region |> 
  export_tsv(outdir = tabledir)
## 
## Exported to: ~/Google Drive/My Drive/Analysis/METTL2A/Tables/DRS_m3C_sites/Metagene_CDS/length_m3Csites_percentage_groupedby_region_2024-07-29.tsv
## # A tibble: 18 × 4
## # Groups:   genetype2 [2]
##    kmer_region   genetype2 name    percent
##    <chr>         <chr>     <chr>     <dbl>
##  1 CDS           mRNA      m3C     70.5   
##  2 CDS           mRNA      C       38.4   
##  3 CDS           mRNA      length  34.8   
##  4 fiveprimeUTR  mRNA      m3C     11.8   
##  5 fiveprimeUTR  mRNA      C        9.45  
##  6 fiveprimeUTR  mRNA      length   6.25  
##  7 threeprimeUTR mRNA      m3C     17.7   
##  8 threeprimeUTR mRNA      C       52.2   
##  9 threeprimeUTR mRNA      length  59.0   
## 10 CDS           mt-mRNA   m3C    100     
## 11 CDS           mt-mRNA   C      100     
## 12 CDS           mt-mRNA   length  99.9   
## 13 fiveprimeUTR  mt-mRNA   m3C     NA     
## 14 fiveprimeUTR  mt-mRNA   C       NA     
## 15 fiveprimeUTR  mt-mRNA   length   0     
## 16 threeprimeUTR mt-mRNA   m3C     NA     
## 17 threeprimeUTR mt-mRNA   C       NA     
## 18 threeprimeUTR mt-mRNA   length   0.0701
percentage_m3Csites_groupedby_region <- 
  length_m3Csites_percentage_groupedby_region |>
  mutate(
    kmer_region = factor(kmer_region, levels = c('fiveprimeUTR', 'CDS', 'threeprimeUTR')),
    name = factor(name, levels = c('length', 'C', 'm3C'))
  ) |> 
  ggplot(aes(x = name, y = percent, fill = kmer_region)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  scale_y_reverse() +
  scale_fill_manual(values = c('#90d2d8', '#f6a6b2', '#ffecb8')) +
  facet_wrap( ~ genetype2, ncol = 1, scales = 'free')
percentage_m3Csites_groupedby_region |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 4.5, fontsize = 7
  )
## Warning: Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).
## Removed 4 rows containing missing values (`position_stack()`).

Metagene plots with different adjust values

adjust_value_list <- c(1/10, 1/5, 1/2, 1, 2, 5, 10)
adjust_value_list |> 
  map(plot_metagene_distribution_different_adjustment)
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Removed 3 rows containing non-finite values (`stat_density()`).
## Warning in grid.Call.graphics(C_rect, x$x, x$y, x$width, x$height,
## resolveHJust(x$just, : semi-transparency is not supported on this device:
## reported only once per page
## Warning: Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Removed 203 rows containing non-finite values (`stat_density()`).
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## Removed 3288 rows containing non-finite values (`stat_density()`).
## [[1]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

## 
## [[2]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

## 
## [[3]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

## 
## [[4]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

## 
## [[5]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

## 
## [[6]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

## 
## [[7]]
## Warning: Removed 3288 rows containing non-finite values (`stat_density()`).

plot_metagene_distribution_oneregion_dif_adjust <- function(.region, .adjust_val) {
  
  plot_basename <- paste0('metageneplot_', .region, '_adjust_', .adjust_val)
  
  metageneplot_region <- 
    allC_m3C_mRNA_relposition_bound |> 
    filter(kmer_region == .region) |> 
    ggplot(aes(x = rel_position_within_region, colour = type)) +
    geom_density(adjust = .adjust_val, lwd = 1.1) +
    #  geom_vline(xintercept = c(0), colour = 'gray20') +
    geom_hline(yintercept = 0, colour = 'gray20') +
    scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
  
  metageneplot_region |> 
    ggsave_multiple_formats(
      outdir = figdir, basename = plot_basename,
      width = 4, height = 4, fontsize = 7
    )
  
}


unique_regions <- c("fiveprimeUTR", "CDS", "threeprimeUTR")

map2(unique_regions, 0.5, plot_metagene_distribution_oneregion_dif_adjust) 
## [[1]]

## 
## [[2]]

## 
## [[3]]

allC_m3C_mRNA_relposition_bound |> 
  filter(kmer_region == 'CDS') |> 
  ggplot(aes(x = rel_position_within_region, colour = type)) +
  geom_density(adjust = .5) +
#  geom_vline(xintercept = c(0), colour = 'gray20') +
#  geom_hline(yintercept = 0, colour = 'gray20') +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))

allC_m3C_mRNA_relposition_bound |> 
  filter(kmer_region == 'CDS') |> 
  ggplot(aes(x = kmer_middle - (thickStart + 1), colour = type)) +
  geom_density(adjust = 1/2) +
  geom_vline(xintercept = c(0, 80, 350), colour = 'gray') +
  geom_hline(yintercept = 0, colour = 'gray') +
  scale_color_manual(values = c('gray', 'blue', 'red'))

# allC_m3C_mRNA_relposition_bound |> 
#   filter(kmer_region == 'CDS') |> 
#   ggplot(aes(x = rel_position, colour = type)) +
#   geom_density(adjust = .5) +
#   geom_vline(xintercept = 0, colour = 'gray20') +
#   geom_hline(yintercept = 0, colour = 'gray20') +
#   scale_color_manual(values = c('blue', 'red'))

Region length

m3C_relposition_mRNA_metagene |> 
  group_by((length - thickEnd) < 248) |> 
  reframe(n = n(), num_m3C_3UTR = sum(kmer_region == 'threeprimeUTR'))
## # A tibble: 3 × 3
##   `(length - thickEnd) < 248`     n num_m3C_3UTR
##   <lgl>                       <int>        <int>
## 1 FALSE                         117           36
## 2 TRUE                          137            9
## 3 NA                              3           NA
m3C_relposition_mRNA_metagene |> 
  group_by(thickStart < 70) |> 
  reframe(n = n(), num_m3C_5UTR = sum(kmer_region == 'fiveprimeUTR'))
## # A tibble: 3 × 3
##   `thickStart < 70`     n num_m3C_5UTR
##   <lgl>             <int>        <int>
## 1 FALSE               136           25
## 2 TRUE                118            5
## 3 NA                    3           NA
m3C_relposition_mRNA_metagene |> 
  group_by(thickEnd - thickStart < 406) |> 
  reframe(n = n(), num_m3C_CDS = sum(kmer_region == 'CDS'))
## # A tibble: 3 × 3
##   `thickEnd - thickStart < 406`     n num_m3C_CDS
##   <lgl>                         <int>       <int>
## 1 FALSE                           152         127
## 2 TRUE                            102          52
## 3 NA                                3          NA

Number of sites

allC_m3C_mRNA_relposition_bound |> 
  group_by(type, kmer_region) |> 
  reframe(n = n())
## # A tibble: 10 × 3
##    type   kmer_region       n
##    <chr>  <chr>         <int>
##  1 all CC CDS            1730
##  2 all CC fiveprimeUTR    479
##  3 all CC threeprimeUTR  2159
##  4 allC   CDS            7490
##  5 allC   fiveprimeUTR   1845
##  6 allC   threeprimeUTR 10184
##  7 m3C    CDS             179
##  8 m3C    fiveprimeUTR     30
##  9 m3C    threeprimeUTR    45
## 10 m3C    <NA>              3

Frame

allC_m3C_mRNA_relposition_bound |> 
  filter(type != 'all CC') |> 
  filter(!is.na(kmer_region)) |> 
  mutate(frame = (kmer_middle - (thickEnd + 1) ) %% 3) |> 
  group_by(frame, kmer_region, type, genetype2) |> 
  reframe(n = n()) |> 
  group_by(kmer_region, type, genetype2) |> 
  mutate(percentage = 100 * n /sum(n)) |>
  pivot_wider(
    id_cols = c(kmer_region, type, genetype2), 
    names_from = frame, names_prefix = 'frame_',
    values_from = percentage
  )
## # A tibble: 6 × 6
## # Groups:   kmer_region, type, genetype2 [6]
##   kmer_region   type  genetype2 frame_0 frame_1 frame_2
##   <chr>         <chr> <chr>       <dbl>   <dbl>   <dbl>
## 1 CDS           allC  mRNA         27.3    28.7    44.0
## 2 CDS           m3C   mRNA         20.7    26.3    53.1
## 3 fiveprimeUTR  allC  mRNA         29.1    33.6    37.3
## 4 fiveprimeUTR  m3C   mRNA         26.7    30      43.3
## 5 threeprimeUTR allC  mRNA         32.8    33.7    33.5
## 6 threeprimeUTR m3C   mRNA         28.9    26.7    44.4
allC_m3C_mRNA_relposition_bound
## # A tibble: 24,144 × 22
##    transcript_id     base  kmer_middle start   end thickStart thickEnd length
##    <chr>             <chr>       <dbl> <dbl> <dbl>      <dbl>    <dbl>  <dbl>
##  1 ENST00000429711.7 C               3     0  2094         77      482   2094
##  2 ENST00000429711.7 C               4     0  2094         77      482   2094
##  3 ENST00000429711.7 C               5     0  2094         77      482   2094
##  4 ENST00000429711.7 C               9     0  2094         77      482   2094
##  5 ENST00000429711.7 C              11     0  2094         77      482   2094
##  6 ENST00000429711.7 C              13     0  2094         77      482   2094
##  7 ENST00000429711.7 C              14     0  2094         77      482   2094
##  8 ENST00000429711.7 C              16     0  2094         77      482   2094
##  9 ENST00000429711.7 C              17     0  2094         77      482   2094
## 10 ENST00000429711.7 C              20     0  2094         77      482   2094
## # ℹ 24,134 more rows
## # ℹ 14 more variables: kmer_region <chr>, gene_name <chr>, gene_type <chr>,
## #   genetype2 <chr>, rel_position_within_region <dbl>,
## #   rel_position_metagene <dbl>, type <chr>, seqname <chr>, ref_kmer <chr>,
## #   kmer_start <dbl>, kmer_end <dbl>, rel_kmer_start <dbl>,
## #   rel_kmer_middle <dbl>, rel_kmer_end <dbl>

Codon

codons <- c(
  'TTT', 'TTC', 'TTA', 'TTG', 'TCT', 'TCC', 'TCA', 'TCG', 'TAT', 'TAC', 'TAA', 'TAG', 'TGT', 'TGC', 'TGA', 'TGG', 'CTT', 'CTC', 'CTA', 'CTG', 'CCT', 'CCC', 'CCA', 'CCG', 'CAT', 'CAC', 'CAA', 'CAG', 'CGT', 'CGC', 'CGA', 'CGG', 'ATT', 'ATC', 'ATA', 'ATG', 'ACT', 'ACC', 'ACA', 'ACG', 'AAT', 'AAC', 'AAA', 'AAG', 'AGT', 'AGC', 'AGA', 'AGG', 'GTT', 'GTC', 'GTA', 'GTG', 'GCT', 'GCC', 'GCA', 'GCG', 'GAT', 'GAC', 'GAA', 'GAG', 'GGT', 'GGC', 'GGA', 'GGG'
)

methylated_RNAs_CDSseq <- 
  espresso_transcript_seqs |> 
  right_join(DRS_methylated_RNAs) |> 
  left_join(gencode_annotation_CDS) |> 
  remove_noCDSinfo_RNAs() |> 
  left_join(DRS_methylated_RNAs_annotation) |> 
  mutate(CDS_seq = str_sub(transcript_seq, thickStart + 1, thickEnd))
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
## Joining with `by = join_by(transcript_id)`
methylated_RNAs_CDSseq
## # A tibble: 65 × 10
##    transcript_id     transcript_seq    start   end thickStart thickEnd gene_name
##    <chr>             <chr>             <dbl> <dbl>      <dbl>    <dbl> <chr>    
##  1 ENST00000429711.7 AGCCCTTGCGCGCCAC…     0  2094         77      482 RPL32    
##  2 ENST00000647248.2 CTTCTCTTACCGCCAT…     0  1234         64      394 RPL35A   
##  3 ENST00000361390.2 ATACCCATGGCCAACC…     0   956          0      956 MT-ND1   
##  4 ENST00000361453.3 ATTAATCCCCTGGCCC…     0  1042          0     1042 MT-ND2   
##  5 ENST00000361624.2 ATGTTCGCCGACCGTT…     0  1542          0     1542 MT-CO1   
##  6 ENST00000361739.1 ATGGCACATGCAGCGC…     0   684          0      681 MT-CO2   
##  7 ENST00000361899.2 ATGAACGAAAATCTGT…     0   681          0      678 MT-ATP6  
##  8 ENST00000362079.2 ATGACCCACCAATCAC…     0   784          0      784 MT-CO3   
##  9 ENST00000361227.2 ATAAACTTCGCCTTAA…     0   346          0      346 MT-ND3   
## 10 ENST00000361381.2 ATGCTAAAACTAATCG…     0  1378          0     1378 MT-ND4   
## # ℹ 55 more rows
## # ℹ 3 more variables: gene_type <chr>, genetype2 <chr>, CDS_seq <chr>
min(methylated_RNAs_CDSseq$CDS_seq |> str_length())
## [1] 75
methylated_RNAs_CDSseq %>%
  mutate(codon_list = strsplit(CDS_seq, split = "(?<=.{3})", perl = TRUE)) %>%
  unnest(codon_list) %>%
  group_by(genetype2, codon_list) %>%
  summarise(count = n(), .groups = "drop") %>%
  filter(codon_list %in% codons) |> 
  arrange(-count)
## # A tibble: 121 × 3
##    genetype2 codon_list count
##    <chr>     <chr>      <int>
##  1 mRNA      AAG          627
##  2 mRNA      GAG          372
##  3 mRNA      CTG          365
##  4 mRNA      GCC          360
##  5 mRNA      ATC          314
##  6 mRNA      AAA          310
##  7 mRNA      GCT          290
##  8 mRNA      CAG          288
##  9 mRNA      GGC          288
## 10 mRNA      GTG          288
## # ℹ 111 more rows
DRS_methylated_positions_CDSpos_regioninfo |> 
  mutate(frame = (kmer_middle - thickEnd) %% 3) |> 
  filter(frame == 0) |> 
  mutate(codon = str_sub(ref_kmer, 1,3)) |> 
  group_by(kmer_region, genetype2, codon) |> 
  reframe(n = n()) |> 
  arrange(-n)
## # A tibble: 22 × 4
##    kmer_region   genetype2 codon     n
##    <chr>         <chr>     <chr> <int>
##  1 CDS           mRNA      GCC      46
##  2 CDS           mRNA      ACC      20
##  3 CDS           mRNA      GTC       8
##  4 fiveprimeUTR  mRNA      GCC       7
##  5 CDS           mRNA      ATC       6
##  6 threeprimeUTR mRNA      ACC       6
##  7 fiveprimeUTR  mRNA      ACC       5
##  8 CDS           mRNA      CTC       4
##  9 CDS           mRNA      TCC       4
## 10 threeprimeUTR mRNA      GTC       4
## # ℹ 12 more rows

Why middle of CDS?

m3C_relposition_mRNA_metagene |> 
  filter(kmer_region == 'CDS') |> 
  select(transcript_id, gene_name, kmer_middle, ref_kmer, rel_position_metagene) |> 
  arrange(abs(rel_position_metagene - 50)) |> 
  head(20)
## # A tibble: 20 × 5
##    transcript_id      gene_name kmer_middle ref_kmer rel_position_metagene
##    <chr>              <chr>           <dbl> <chr>                    <dbl>
##  1 ENST00000395566.9  MDK               520 GACTA                     41.0
##  2 ENST00000647248.2  RPL35A            384 CCCCT                     40.0
##  3 ENST00000647248.2  RPL35A            383 ACCCC                     39.8
##  4 ENST00000343262.9  RPS2              865 CTCCG                     39.5
##  5 ENST00000646449.2  RPS26             579 CCCCC                     39.5
##  6 ENST00000270625.7  RPS11             475 CACCA                     39.4
##  7 ENST00000331825.11 FTL               695 ATCTC                     39.1
##  8 ENST00000343262.9  RPS2              852 CCCAC                     39.0
##  9 ENST00000343262.9  RPS2              851 ACCCA                     39.0
## 10 ENST00000343262.9  RPS2              840 ACCTC                     38.6
## 11 ENST00000392514.9  RPLP0             929 GCCAC                     37.4
## 12 ENST00000343262.9  RPS2              808 CACCA                     37.3
## 13 ENST00000646449.2  RPS26             557 ACCTG                     37.3
## 14 ENST00000491306.6  RPL37A            277 GCCAT                     37.2
## 15 ENST00000343262.9  RPS2              806 TTCAC                     37.2
## 16 ENST00000368719.9  S100A6            301 GCCTT                     36.8
## 17 ENST00000343262.9  RPS2              786 ACCTC                     36.4
## 18 ENST00000392514.9  RPLP0             899 GCCTT                     36.3
## 19 ENST00000343262.9  RPS2              780 CCCCC                     36.2
## 20 ENST00000429711.7  RPL32             425 GCCCA                     36.1
gencode_annotation_CDS_blocksizes <- 
  read_bed12(
    '/Volumes/Mitsu_NGS_2/METTL2A/Database/gencode.v43.annotation_plus-tRNA_CDS_transcriptome.bed'
    ) |> 
  select(chrom, start, end, thickStart, thickEnd, blockSizes) |> 
  dplyr::rename(transcript_id = chrom) |> 
  left_join(
    read_tsv('/Volumes/Mitsu_NGS_2/METTL2A/Database/gencode.v43.annotation_plus-tRNA.tsv') |> 
      filter(primary_tag == 'transcript') |> 
      select(transcript_id, strand)
  )  |>
  mutate( # strandが-のとき逆順にする 
    blockSizes = ifelse(
      strand == 1, 
      str_split(blockSizes, ',') |>  map(as.integer),
      str_split(blockSizes, ",") |>  map(~rev(as.integer(.x)))
    )
  ) |> 
  select(-strand)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 3422423 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (25): seq_id, source_tag, primary_tag, score, frame, artif_dupl, ccdsid,...
## dbl  (4): start, end, strand, level
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Joining with `by = join_by(transcript_id)`
gencode_annotation_CDS_blocksizes
## # A tibble: 111,324 × 6
##    transcript_id      start   end thickStart thickEnd blockSizes
##    <chr>              <dbl> <dbl>      <dbl>    <dbl> <list>    
##  1 ENST00000003912.7      0  5481        715     1687 <int [13]>
##  2 ENST00000008440.9      0  1667        128      749 <int [3]> 
##  3 ENST00000009105.5      0  2612        245     1673 <int [13]>
##  4 ENST00000010299.10     0  1050         24     1047 <int [10]>
##  5 ENST00000011700.10     0 10969          0     9629 <int [52]>
##  6 ENST00000054650.9      0  1361        156      873 <int [6]> 
##  7 ENST00000054666.11     0  2178         88      388 <int [5]> 
##  8 ENST00000078527.9      0  2066        160     1639 <int [4]> 
##  9 ENST00000164247.5      0  4273        564     1665 <int [16]>
## 10 ENST00000166244.8      0  5019        147     3162 <int [17]>
## # ℹ 111,314 more rows
calc_distance_from_exon_junction <- function(df) {
  
  df |> 
    left_join(
      gencode_annotation_CDS_blocksizes |> select(transcript_id, blockSizes)
    ) |> 
    filter(!is.na(blockSizes)) |> 
    # mutate(
    #   blockSizes = str_split(blockSizes, ',') |> map(as.integer)
    # ) %>%
    rowwise() %>%
    mutate(
      blockIndex = findInterval(kmer_middle, cumsum(c(0, unlist(blockSizes)))),
      blockStart = cumsum(c(0, unlist(blockSizes)))[blockIndex],
      blockEnd = cumsum(c(0, unlist(blockSizes)))[blockIndex + 1],
      distanceFromStart = kmer_middle - blockStart,
      distanceFromEnd = blockEnd - kmer_middle + 1,
      exonlength = (blockEnd - blockStart),
      relativePosition = distanceFromStart / exonlength 
    ) %>%
    ungroup()
  
}


allC_m3C_mRNA_relposition_bound_exonposition <- 
  allC_m3C_mRNA_relposition_bound |> 
  #m3C_relposition_mRNA_metagene |>
  select(transcript_id, gene_name, end, genetype2, kmer_region, kmer_middle, type) |> 
  calc_distance_from_exon_junction() |> 
  mutate(
    distanceFromStart = ifelse(
      distanceFromStart == kmer_middle, NA, distanceFromStart
    ),
    distanceFromEnd = ifelse(
      distanceFromEnd == end - kmer_middle + 1,
      NA, distanceFromEnd
    )
  ) |>
  rowwise() |> 
  mutate(
    dist_from_exon_junction = min(distanceFromStart, distanceFromEnd, na.rm = TRUE),
  ) |> 
  ungroup() |> 
  filter(!is.na(kmer_region))
## Joining with `by = join_by(transcript_id)`
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `dist_from_exon_junction = min(distanceFromStart,
##   distanceFromEnd, na.rm = TRUE)`.
## ℹ In row 19566.
## Caused by warning in `min()`:
## ! no non-missing arguments to min; returning Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
allC_m3C_mRNA_relposition_bound_exonposition
## # A tibble: 24,141 × 16
##    transcript_id     gene_name   end genetype2 kmer_region  kmer_middle type 
##    <chr>             <chr>     <dbl> <chr>     <chr>              <dbl> <chr>
##  1 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR           3 allC 
##  2 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR           4 allC 
##  3 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR           5 allC 
##  4 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR           9 allC 
##  5 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR          11 allC 
##  6 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR          13 allC 
##  7 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR          14 allC 
##  8 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR          16 allC 
##  9 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR          17 allC 
## 10 ENST00000429711.7 RPL32      2094 mRNA      fiveprimeUTR          20 allC 
## # ℹ 24,131 more rows
## # ℹ 9 more variables: blockSizes <list>, blockIndex <int>, blockStart <dbl>,
## #   blockEnd <dbl>, distanceFromStart <dbl>, distanceFromEnd <dbl>,
## #   exonlength <dbl>, relativePosition <dbl>, dist_from_exon_junction <dbl>
unique(allC_m3C_mRNA_relposition_bound_exonposition$type)
## [1] "allC"   "m3C"    "all CC"
unique(allC_m3C_mRNA_relposition_bound_exonposition$kmer_region)
## [1] "fiveprimeUTR"  "CDS"           "threeprimeUTR"

Distance from exon junction

All regions

allC_m3C_mRNA_relposition_bound_exonposition |> 
  rstatix::wilcox_test(dist_from_exon_junction ~ type, p.adjust.method = 'none')
## # A tibble: 3 × 9
##   .y.         group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>       <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 dist_from_… all CC allC    4368 19519  41485745 5   e- 3 5   e- 3 **          
## 2 dist_from_… all CC m3C     4368   254    789707 6.21e-30 6.21e-30 ****        
## 3 dist_from_… allC   m3C    19519   254   3585067 1.94e-34 1.94e-34 ****
ecdf_distance_from_exonjunction <- 
  allC_m3C_mRNA_relposition_bound_exonposition |> 
  ggplot(aes(x = dist_from_exon_junction + 1, colour = type)) +
  scale_x_log10() +
  stat_ecdf(lwd = 1.1) +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
ecdf_distance_from_exonjunction |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 4, fontsize = 7
  )

Grouped by regions

allC_m3C_mRNA_relposition_bound_exonposition |> 
  group_by(kmer_region) |> 
  rstatix::wilcox_test(dist_from_exon_junction ~ type, p.adjust.method = 'none')
## # A tibble: 9 × 10
##   kmer_region   .y.        group1 group2    n1    n2 statistic        p    p.adj
## * <chr>         <chr>      <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl>
## 1 CDS           dist_from… all CC allC    1730  7490  6495480  8.68e- 1 8.68e- 1
## 2 CDS           dist_from… all CC m3C     1730   179   161603  3.35e- 1 3.35e- 1
## 3 CDS           dist_from… allC   m3C     7490   179   697623  3.52e- 1 3.52e- 1
## 4 fiveprimeUTR  dist_from… all CC allC     479  1845   450890  4.91e- 1 4.91e- 1
## 5 fiveprimeUTR  dist_from… all CC m3C      479    30     7746  4.73e- 1 4.73e- 1
## 6 fiveprimeUTR  dist_from… allC   m3C     1845    30    29326. 5.75e- 1 5.75e- 1
## 7 threeprimeUTR dist_from… all CC allC    2159 10184 10809061  2.2 e- 1 2.2 e- 1
## 8 threeprimeUTR dist_from… all CC m3C     2159    45    85392  2.96e-18 2.96e-18
## 9 threeprimeUTR dist_from… allC   m3C    10184    45   406584. 2.77e-19 2.77e-19
## # ℹ 1 more variable: p.adj.signif <chr>
ecdf_distance_from_exonjunction_groupedby_region <- 
  allC_m3C_mRNA_relposition_bound_exonposition |> 
  ggplot(aes(x = dist_from_exon_junction + 1, colour = type)) +
  scale_x_log10() +
  stat_ecdf(lwd = 1.1) +
  facet_wrap( ~ kmer_region, ncol = 1, scales = 'free') +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
ecdf_distance_from_exonjunction_groupedby_region |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 9, fontsize = 7
  )

Exon length

All regions

allC_m3C_mRNA_relposition_bound_exonposition |> 
  filter(!is.na(exonlength)) |> 
  rstatix::wilcox_test(exonlength ~ type)
## # A tibble: 3 × 9
##   .y.        group1 group2    n1    n2 statistic        p    p.adj p.adj.signif
## * <chr>      <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl> <chr>       
## 1 exonlength all CC allC    4368 19512 41274680  1   e- 3 1   e- 3 **          
## 2 exonlength all CC m3C     4368   254   779222. 1.77e-27 3.54e-27 ****        
## 3 exonlength allC   m3C    19512   254  3538188. 8.18e-32 2.45e-31 ****
ecdf_exon_length <- 
  allC_m3C_mRNA_relposition_bound_exonposition |>
  filter(!is.na(exonlength)) |> 
  ggplot(aes(x = exonlength + 1, colour = type)) +
  scale_x_log10(limits = c(10, 10000)) +
  stat_ecdf(lwd = 1.1) +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
ecdf_exon_length |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 4, fontsize = 7
  )

Grouped by regions

allC_m3C_mRNA_relposition_bound_exonposition |> 
  filter(!is.na(exonlength)) |> 
  group_by(kmer_region) |> 
  rstatix::wilcox_test(exonlength ~ type, p.adjust.method = 'none')
## # A tibble: 9 × 10
##   kmer_region   .y.        group1 group2    n1    n2 statistic        p    p.adj
## * <chr>         <chr>      <chr>  <chr>  <int> <int>     <dbl>    <dbl>    <dbl>
## 1 CDS           exonlength all CC allC    1730  7490  6515654. 7.12e- 1 7.12e- 1
## 2 CDS           exonlength all CC m3C     1730   179   166752. 9   e- 2 9   e- 2
## 3 CDS           exonlength allC   m3C     7490   179   718260  1.02e- 1 1.02e- 1
## 4 fiveprimeUTR  exonlength all CC allC     479  1845   462156. 1.21e- 1 1.21e- 1
## 5 fiveprimeUTR  exonlength all CC m3C      479    30     7561  6.31e- 1 6.31e- 1
## 6 fiveprimeUTR  exonlength allC   m3C     1845    30    27669  9.99e- 1 9.99e- 1
## 7 threeprimeUTR exonlength all CC allC    2159 10177 10647886. 2.4 e- 2 2.4 e- 2
## 8 threeprimeUTR exonlength all CC m3C     2159    45    82104. 1.84e-15 1.84e-15
## 9 threeprimeUTR exonlength allC   m3C    10177    45   392004. 1.31e-16 1.31e-16
## # ℹ 1 more variable: p.adj.signif <chr>
ecdf_exon_length_groupedby_region <- 
  allC_m3C_mRNA_relposition_bound_exonposition |>
  filter(!is.na(exonlength)) |> 
  ggplot(aes(x = exonlength , colour = type)) +
  scale_x_log10(limits = c(10, 10000)) +
  stat_ecdf(lwd = 1.1) +
  facet_wrap( ~ kmer_region, ncol = 1, scales = 'free') +
  scale_color_manual(values = c('#8C8C8C', '#7979D2', '#E60000'))
ecdf_exon_length_groupedby_region |> 
  ggsave_multiple_formats(
    outdir = figdir, width = 4, height = 9, fontsize = 7
  )

allC_m3C_mRNA_relposition_bound_exonposition |>
  filter(type == 'm3C') |> 
  filter(transcript_id == 'ENST00000303204.9') |>
  head(20) |> 
  #filter(grepl('HNRN', gene_name)) |> 
  #View()
  select(
    gene_name, transcript_id, kmer_middle, kmer_region, blockIndex, 
    distanceFromStart, distanceFromEnd
  )
## # A tibble: 1 × 7
##   gene_name transcript_id   kmer_middle kmer_region blockIndex distanceFromStart
##   <chr>     <chr>                 <dbl> <chr>            <int>             <dbl>
## 1 PRELID1   ENST0000030320…         540 CDS                  3                35
## # ℹ 1 more variable: distanceFromEnd <dbl>
m3C_relposition_mRNA_metagene |> 
  filter(grepl('ENST00000552461.5', transcript_id)) 
## # A tibble: 1 × 20
##   transcript_id     gene_name seqname gene_type     ref_kmer kmer_start kmer_end
##   <chr>             <chr>     <chr>   <chr>         <chr>         <dbl>    <dbl>
## 1 ENST00000552461.5 RPLP0     chr12   protein_codi… CCCCA          1921     1925
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## #   kmer_region <chr>, rel_position_within_region <dbl>,
## #   rel_position_metagene <dbl>
unique(m3C_relposition_mRNA_metagene$transcript_id)
##  [1] "ENST00000429711.7"    "ENST00000647248.2"    "ENST00000215754.8"   
##  [4] "ENST00000199764.7"    "ENST00000270625.7"    "ENST00000331825.11"  
##  [7] "ENST00000229239.10"   "ENST00000552551.5"    "ENST00000388835.4"   
## [10] "ENST00000646449.2"    "ENST00000501597.3"    "ENST00000202773.14"  
## [13] "ENST00000551150.5"    "ENST00000552461.5"    "ENST00000392514.9"   
## [16] "ESPRESSO:chr2:8790:4" "ENST00000233143.6"    "ENST00000491306.6"   
## [19] "ENST00000358435.9"    "ENST00000468812.6"    "ENST00000646101.2"   
## [22] "ENST00000321153.9"    "ENST00000314138.11"   "ENST00000395566.9"   
## [25] "ENST00000538451.1"    "ENST00000620041.4"    "ENST00000314133.4"   
## [28] "ENST00000273550.12"   "ENST00000274065.9"    "ENST00000009589.8"   
## [31] "ENST00000352983.7"    "ENST00000287038.8"    "ENST00000530705.6"   
## [34] "ENST00000361575.4"    "ENST00000369817.7"    "ENST00000243997.8"   
## [37] "ENST00000343986.9"    "ENST00000260379.11"   "ENST00000274242.10"  
## [40] "ENST00000296674.13"   "ENST00000407193.7"    "ENST00000303204.9"   
## [43] "ENST00000254810.8"    "ENST00000556230.2"    "ENST00000286953.8"   
## [46] "ENST00000361436.10"   "ENST00000323345.11"   "ENST00000343262.9"   
## [49] "ENST00000309268.11"   "ENST00000230050.4"    "ENST00000613865.5"   
## [52] "ENST00000234875.9"    "ENST00000270792.10"   "ENST00000354332.8"   
## [55] "ENST00000368719.9"    "ENST00000368811.8"    "ENST00000368716.9"   
## [58] "ENST00000651669.1"    "ENST00000398752.11"
m3C_relposition_mRNA_metagene |> 
  filter(transcript_id == 'ENST00000368716.9')
## # A tibble: 4 × 20
##   transcript_id     gene_name seqname gene_type     ref_kmer kmer_start kmer_end
##   <chr>             <chr>     <chr>   <chr>         <chr>         <dbl>    <dbl>
## 1 ENST00000368716.9 S100A4    chr1    protein_codi… GTCCA           108      112
## 2 ENST00000368716.9 S100A4    chr1    protein_codi… GCCAT           316      320
## 3 ENST00000368716.9 S100A4    chr1    protein_codi… GCCAG           404      408
## 4 ENST00000368716.9 S100A4    chr1    protein_codi… TTCCA           446      450
## # ℹ 13 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>,
## #   start <dbl>, end <dbl>, thickStart <dbl>, thickEnd <dbl>,
## #   kmer_region <chr>, rel_position_within_region <dbl>,
## #   rel_position_metagene <dbl>
m3C_relposition_mRNA_metagene |> 
  select(transcript_id, gene_name) |> 
  distinct() |> 
  arrange(gene_name) |> 
  export_tsv(basename = 'temp.tsv')
## 
## Exported to: Tables/temp.tsv_2024-07-29.tsv
## # A tibble: 59 × 2
##    transcript_id      gene_name
##    <chr>              <chr>    
##  1 ENST00000646101.2  ARPC1B   
##  2 ENST00000398752.11 ATP5F1A  
##  3 ENST00000243997.8  ATP5F1E  
##  4 ENST00000286953.8  ATP5MJ   
##  5 ENST00000199764.7  CEACAM6  
##  6 ENST00000314133.4  COX8A    
##  7 ENST00000309268.11 EEF1A1   
##  8 ENST00000620041.4  FTH1     
##  9 ENST00000273550.12 FTH1     
## 10 ENST00000331825.11 FTL      
## # ℹ 49 more rows
DRS_methylated_positions
## # A tibble: 489 × 13
##    transcript_id     gene_name seqname gene_type    ref_kmer kmer_start kmer_end
##    <chr>             <chr>     <chr>   <chr>        <chr>         <dbl>    <dbl>
##  1 ENST00000429711.7 RPL32     chr3    protein_cod… GCCCA           423      427
##  2 ENST00000647248.2 RPL35A    chr3    protein_cod… ACCCC           381      385
##  3 ENST00000647248.2 RPL35A    chr3    protein_cod… CCCCT           382      386
##  4 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCG            58       62
##  5 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCT            76       80
##  6 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ATCAA            94       98
##  7 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      GCCAC           149      153
##  8 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      ACCCC           154      158
##  9 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCC           155      159
## 10 ENST00000389680.2 MT-RNR1   chrM    Mt_rRNA      CCCCA           156      160
## # ℹ 479 more rows
## # ℹ 6 more variables: kmer_middle <dbl>, genetype2 <chr>, length <dbl>,
## #   rel_kmer_start <dbl>, rel_kmer_middle <dbl>, rel_kmer_end <dbl>
allC_m3C_mRNA_relposition_bound_exonposition |> 
  filter(type == 'm3C') 
## # A tibble: 254 × 16
##    transcript_id     gene_name   end genetype2 kmer_region   kmer_middle type 
##    <chr>             <chr>     <dbl> <chr>     <chr>               <dbl> <chr>
##  1 ENST00000429711.7 RPL32      2094 mRNA      CDS                   425 m3C  
##  2 ENST00000647248.2 RPL35A     1234 mRNA      CDS                   383 m3C  
##  3 ENST00000647248.2 RPL35A     1234 mRNA      CDS                   384 m3C  
##  4 ENST00000215754.8 MIF         557 mRNA      fiveprimeUTR           81 m3C  
##  5 ENST00000215754.8 MIF         557 mRNA      CDS                   182 m3C  
##  6 ENST00000215754.8 MIF         557 mRNA      CDS                   193 m3C  
##  7 ENST00000215754.8 MIF         557 mRNA      threeprimeUTR         486 m3C  
##  8 ENST00000199764.7 CEACAM6    2594 mRNA      threeprimeUTR        1700 m3C  
##  9 ENST00000270625.7 RPS11       573 mRNA      CDS                   163 m3C  
## 10 ENST00000270625.7 RPS11       573 mRNA      CDS                   475 m3C  
## # ℹ 244 more rows
## # ℹ 9 more variables: blockSizes <list>, blockIndex <int>, blockStart <dbl>,
## #   blockEnd <dbl>, distanceFromStart <dbl>, distanceFromEnd <dbl>,
## #   exonlength <dbl>, relativePosition <dbl>, dist_from_exon_junction <dbl>
DRS_methylated_positions |> 
  mutate(start = kmer_middle - 1, end = kmer_middle) |> 
  select(transcript_id, start, end) |> 
  write_tsv(file = 'Tables/DRS_m3C_sites/m3C_sites.bed' |> paste_wd(), col_names = FALSE)

gencode_annotation <- 
  read_tsv('Tables/Database/gencode.v43.annotation.tsv' |> paste_wd())
## Rows: 3422892 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (24): seq_id, source_tag, primary_tag, score, frame, artif_dupl, ccdsid,...
## dbl  (4): start, end, strand, level
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gencode_annotation
## # A tibble: 3,422,892 × 28
##    seq_id source_tag primary_tag  start    end score strand frame artif_dupl
##    <chr>  <chr>      <chr>        <dbl>  <dbl> <chr>  <dbl> <chr> <chr>     
##  1 chrY   HAVANA     gene        253743 255091 .          1 .     N/A       
##  2 chrY   HAVANA     transcript  253743 255091 .          1 .     N/A       
##  3 chrY   HAVANA     exon        253743 253846 .          1 .     N/A       
##  4 chrY   HAVANA     exon        254937 255091 .          1 .     N/A       
##  5 chrY   HAVANA     gene        276322 303356 .          1 .     N/A       
##  6 chrY   HAVANA     transcript  276322 303353 .          1 .     N/A       
##  7 chrY   HAVANA     exon        276322 276394 .          1 .     N/A       
##  8 chrY   HAVANA     exon        281482 281684 .          1 .     N/A       
##  9 chrY   HAVANA     exon        284167 284314 .          1 .     N/A       
## 10 chrY   HAVANA     exon        288733 288869 .          1 .     N/A       
## # ℹ 3,422,882 more rows
## # ℹ 19 more variables: ccdsid <chr>, exon_id <chr>, exon_number <chr>,
## #   gene_id <chr>, gene_name <chr>, gene_type <chr>, havana_gene <chr>,
## #   havana_transcript <chr>, hgnc_id <chr>, ID <chr>, level <dbl>, ont <chr>,
## #   Parent <chr>, protein_id <chr>, tag <chr>, transcript_id <chr>,
## #   transcript_name <chr>, transcript_support_level <chr>,
## #   transcript_type <chr>
gencode_annotation |> 
#  select(transcript_id) |> 
  right_join(
    read_tsv('Tables/DRS/Positions/common_sig_seqs_in_intensity_up_2024-04-22.tsv.gz' |> paste_wd()) |> 
      filter(grepl('..C..', ref_kmer)) |> 
      select(gene_id) |> 
      distinct(),
    by = join_by(gene_id)
  ) |>
  write_tsv('Tables/Database/gencode.v43.annotation_m3CRNAs.tsv' |> paste_wd())
## Rows: 605 Columns: 65
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (30): transcript_id, transcript_name, ref_kmer, GMM_cov_type_G, cluster_...
## dbl (35): position, GMM_logit_pvalue_G, KS_dwell_pvalue_G, KS_intensity_pval...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.